# -*- coding: utf-8 -*-

"""
Datetime: 2019/6/11
Author: Zhang Yafei
Description: 爬虫Spider
"""
import os
import re
from urllib.parse import urljoin

import pandas as pd

from engine import Request
from settings import TO_FILE


class XiaohuaSpider(object):
    """Custom Spider: crawls xiaohuar.com list pages and downloads the images found."""
    # 1. Start URL list — list pages 0 through 3.
    start_urls = [f'http://www.xiaohuar.com/list-1-{i}.html' for i in range(4)]
    # Compiled once at class creation instead of on every parse() call.
    img_pattern = re.compile('<img width="210".*alt="(.*?)".*src="(.*?)" />')

    def filter_downloaded_urls(self):
        """2. Hook to filter out already-downloaded URLs (no-op by default)."""
        # self.start_urls = self.start_urls
        pass

    def start_request(self):
        """3. Yield a Request (queued by the engine) for every start URL."""
        for url in self.start_urls:
            yield Request(url=url, callback=self.parse)

    async def parse(self, response):
        """4. Parse a list page response: extract (name, img src) pairs,
        store them, and return follow-up image-download Requests.
        """
        # Site serves GBK-encoded pages.
        html = await response.text(encoding='gbk')
        results = self.img_pattern.findall(html)
        item_list = []
        request_list = []
        for name, src in results:
            # src may be site-relative; resolve against the site root.
            img_url = src if src.startswith('http') else urljoin('http://www.xiaohuar.com', src)
            item_list.append({'name': name, 'img_url': img_url})
            request_list.append(Request(url=img_url, callback=self.download_img, meta={'name': name}))
        # 4.1 Persist the scraped items.
        await self.store_data(data=item_list, url=response.url)
        # 4.2 Return follow-up requests with their callbacks.
        return request_list

    @staticmethod
    async def store_data(data, url):
        """5. Append items to TO_FILE as CSV; write the header only on first creation."""
        df = pd.DataFrame(data=data)
        if os.path.exists(TO_FILE):
            df.to_csv(TO_FILE, index=False, mode='a', header=False, encoding='utf_8_sig')
        else:
            df.to_csv(TO_FILE, index=False, encoding='utf_8_sig')
        print(f'{url}\t数据下载完成')

    @staticmethod
    async def download_img(response):
        """Second-depth download: write the response bytes to images/<name>.jpg."""
        name = response.request.meta.get('name')
        # name comes from untrusted scraped HTML (the alt attribute) and may
        # contain path separators or characters invalid in filenames —
        # replace them so the file always lands inside images/.
        safe_name = re.sub(r'[\\/:*?"<>|]', '_', name)
        # Bug fix: the images/ directory was never created, so the first
        # write raised FileNotFoundError on a fresh checkout.
        os.makedirs('images', exist_ok=True)
        with open(f'images/{safe_name}.jpg', mode='wb') as f:
            f.write(await response.read())
        print(f'{name}\t下载成功')
